Load the Data


In [2]:
code_lines = sqlCtx.read.json(
    # Put the location to your data here
    'git_repos/*.json.gz',
)
# Repartition so the tokenising and Word2Vec training parallelise well
code_lines = code_lines.repartition(300)
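
The rest of the notebook assumes each JSON record is one line of git blame output, with fields such as author, author_mail, commit_id, filename, line, line_num and repo_name. A quick way to confirm what actually came in is to print the inferred schema:

# Inspect the schema Spark inferred from the JSON records
code_lines.printSchema()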

Create the Python2Vec Model


In [3]:
import re

def split_code(tokens):
    # Join the tokens into one string and pull out the word-like pieces
    # (identifiers, keywords, numbers)
    joined = ' '.join(tokens)
    pattern = re.compile(ur"\w+", re.UNICODE)
    return pattern.findall(joined)
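
For example (with an illustrative pair of source lines), the tokeniser returns a flat list of word-like tokens:

split_code(['def foo(bar):', 'return bar + 1'])
# ['def', 'foo', 'bar', 'return', 'bar', '1']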

In [4]:
words = code_lines\
    .map(lambda row: row.line.split())\
    .map(lambda tokens: [t.lower() for t in tokens])\
    .map(split_code)\
    .filter(lambda tokens: tokens != [])
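
Before training it is worth peeking at a few tokenised lines to confirm the pipeline behaves as expected (a small sketch; take() launches a job):

# Look at a handful of tokenised lines before training
for tokens in words.take(5):
    print tokens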

In [5]:
from pyspark.mllib.feature import Word2Vec

word2vec = Word2Vec()
word2vec.setMinCount(25) # Default 5
word2vec.setVectorSize(50) # Default 100
model = word2vec.fit(words)
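
Once the model is fit you can query it directly, for example to find the tokens whose vectors sit closest to a given word (a sketch; 'def' is just an illustrative query and must have survived the minCount cutoff):

# Find the 5 tokens closest to 'def' in the learned vector space
for word, cosine_sim in model.findSynonyms('def', 5):
    print word, cosine_sim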

Save the Model

We save two copies: a JSON version that can be passed around to other people, and a pickle version that you can use to load the model on your own machine.


In [7]:
import json

model_dict = {k:list(v) for k,v in dict(model.getVectors()).iteritems()}

with open("/tmp/py2vec_model.json", "w") as f:
    json.dump(model_dict, f, indent=4)
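
Someone receiving the JSON file can rebuild the same word-to-vector dictionary with nothing but the standard library and numpy (a sketch, assuming the file sits at the path used above):

import json
import numpy as np

# Rebuild the word -> vector mapping from the shared JSON file
with open("/tmp/py2vec_model.json") as f:
    shared_model = {k: np.array(v) for k, v in json.load(f).items()}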

In [15]:
import cPickle as pickle
import numpy as np

model_dict = {k:np.array(list(v)) for k,v in dict(model.getVectors()).iteritems()}

with open("/tmp/py2vec_model.pkl", "wb") as f:
    pickle.dump(model_dict, f)

Load the Model


In [16]:
with open("/tmp/py2vec_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)
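
With the dictionary of numpy vectors loaded, the usual Word2Vec-style queries are just a few lines of numpy, e.g. cosine similarity between two tokens (a sketch; the example words are illustrative and must be present in the vocabulary):

import numpy as np

def similarity(model, word_a, word_b):
    # Cosine similarity between the vectors for two tokens
    a, b = model[word_a], model[word_b]
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

similarity(loaded_model, 'def', 'class')  # illustrative tokens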